#!/usr/bin/python
# -*- coding: utf-8 -*-

import re
import string

# Extract title/category records from a dump of HTML pages (Python 2 script:
# it relies on the `unicode` builtin and on byte-string regex matching).
#
# Every line of the input file is handled as one page, and the 1-based line
# number is used as the page id when looking for that page's redirect links.
# For each page whose <title> carries the site suffix matched by
# PAGE_TITLE_RE, the output file receives, per distinct title (redirect
# sources, merged synonyms and the page's own title), one "title: ..." line
# followed by one "cat: ..." line for every tag link found on the page.
# Titles and categories are written lower-cased.

PAGES_PATH = '/home/chaojiansong/videobook/notinsvn/allpages'
CATS_PATH = '/home/chaojiansong/videobook/notinsvn/open_cats'

# Constant patterns are compiled once.  The redirect pattern below is
# different on every line, so passing these as plain strings would make them
# compete with it for the `re` module's small pattern cache.
TITLE_TAG_RE = re.compile(r'<title>([\s\S]*?)</title>')
SYNONYM_RE = re.compile('<a class="nslog:1014" href="/history/id=\\d+" target="_blank">(.*?)</a>和(.*?)是<a class="nslog:1015" href="http://www.baidu.com/search/baike_help.html#同义词"')
PAGE_TITLE_RE = re.compile('(.*)_百度百科')
TAG_RE = re.compile("<a href='/taglist\\?tag=.*?&tagfromview' target='_blank'>(.*?)</a>")

# `with` guarantees both files are closed (and the output flushed) even when
# processing a line raises.  The input is opened first so that a missing
# input file does not truncate an existing output file.
with open(PAGES_PATH, 'r') as pages:
    with open(CATS_PATH, 'w') as cat:
        for cnt, line in enumerate(pages, 1):
            # Progress indicator: the current line number / page id.
            print(cnt)

            # The <title> tag is ASCII, so it is located on the raw bytes
            # before any transcoding happens.
            title_match = TITLE_TAG_RE.search(line)
            if not title_match:
                continue

            # Transcode cp936 -> UTF-8 (undecodable bytes are dropped) so the
            # UTF-8 literals in the patterns above can match.
            line = unicode(line, 'cp936', 'ignore').encode('utf-8')
            titles = []

            # Redirect pages: links that redirect to this page id.
            redirect_re = re.compile('href="/view/' + str(cnt) + '\\.html\\?hold=redirect" target="_blank">(.*?)</a>')
            for redirect in redirect_re.finditer(line):
                titles.append(redirect.group(1))

            # Synonym-merged pages: both names of the merged pair are kept.
            for synonym in SYNONYM_RE.finditer(line):
                titles.append(synonym.group(1))
                titles.append(synonym.group(2))

            # Current title: the page's own <title> minus the site suffix.
            # Pages whose title lacks the suffix produce no output at all.
            raw_title = unicode(title_match.group(1), 'cp936', 'ignore').encode('utf-8')
            page_title = PAGE_TITLE_RE.search(raw_title)
            if not page_title:
                continue
            titles.append(page_title.group(1))

            # De-duplicate through a dict, which keeps the same iteration
            # order the script has always produced.
            unique_titles = dict.fromkeys(titles).keys()
            print(','.join(unique_titles))

            # The category lines are identical for every title of a page, so
            # they are built once instead of re-scanning the page per title.
            cat_lines = ''.join(
                'cat: ' + tag.lower() + '\n' for tag in TAG_RE.findall(line)
            )
            for title in unique_titles:
                cat.write('title: ' + title.lower() + '\n')
                cat.write(cat_lines)
